Setup

# Load packages
library("tidyverse")
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggforce")
library("knitr")
library("lubridate")
library("plotly")
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library("zoo")
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library("spls")
## Sparse Partial Least Squares (SPLS) Regression and
## Classification (version 2.3-2)
library("GGally")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library("corrplot")
## corrplot 0.95 loaded
library("plotly")
library("rgl")
library("scatterplot3d")
library("knitr")
# Read in data
cv <- read.csv("https://jmc108.github.io/hugen2073-textbook/exercises/4_amounts_and_tables/cv.csv")
lrr <- read.csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/lrr.csv")
sc <- read.csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/sc.csv")
pcs <- read.csv("/Users/jonathanchernus/Downloads/archive/genetic_data_train.csv")
util <- read_csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/utility.csv")
## Rows: 8783 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): TYPE, DATE, COST, NOTES
## dbl  (1): USAGE (kWh)
## time (2): START TIME, END TIME
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Exercises

Line graph

The chunk below simulates genetic drift in two populations of different sizes (Ne=200 Ne=2000).

We start with an initial allele frequency of p0=0.3 and follow p for 20 generations (generation).

Each population is simulated 12 times, as reflected in the grouping variable pop. So, for example, Ne200_7 in the pop column means “replicate number 7 of the 200-person population.”

Inspect the data to make sure you understand the variables.

set.seed(2073)

# ---- Wright–Fisher drift simulator ----
simulate_drift <- function(p0, Ne, gens, n_pops, label_prefix = "pop") {
  # p0: starting allele frequency
  # Ne: effective population size (diploid)
  # gens: number of generations
  # n_pops: number of replicate populations
  
  map_dfr(seq_len(n_pops), function(i) {
    p <- numeric(gens + 1)
    p[1] <- p0
    for (g in 2:(gens + 1)) {
      # next generation allele count ~ Binomial(2Ne, p_prev)
      k <- rbinom(1, size = 2 * Ne, prob = p[g - 1])
      p[g] <- k / (2 * Ne)
    }
    tibble(
      pop = paste0(label_prefix, i),
      generation = 0:gens,
      p = p
    )
  })
}

# ---- realistic-ish parameters ----
p0   <- 0.30   # common-ish starting allele frequency
gens <- 60
n_pops <- 12

# Two scenarios: small vs large Ne to show volatility differences
d_small <- simulate_drift(p0 = p0, Ne = 200,  gens = gens, n_pops = n_pops, label_prefix = "Ne200_")
d_large <- simulate_drift(p0 = p0, Ne = 2000, gens = gens, n_pops = n_pops, label_prefix = "Ne2000_")

d <- bind_rows(
  d_small %>% mutate(Ne = 200),
  d_large %>% mutate(Ne = 2000)
)

kable(head(d))
pop generation p Ne
Ne200_1 0 0.3000 200
Ne200_1 1 0.2600 200
Ne200_1 2 0.2925 200
Ne200_1 3 0.2600 200
Ne200_1 4 0.3175 200
Ne200_1 5 0.3450 200

Now make a line graph of the allele frequency over time:

  • facet by population size (Ne), in such a way that one plot is on top of the other (rather than side by side)

  • draw a line for each replicate, but make them slightly translucent with alpha (color them all the same; what aesthetic should you use instead of color?)

  • include + theme_classic() to remove the plot background

  • but add a horizontal dotted line to show the initial allele frequency of p0=0.3 (what geom do you need? Include linetype="dashed")

# Your code here
ggplot(d, aes(x = generation, y = p, group = pop)) +
  geom_hline(yintercept = p0, linetype = "dashed") +
  geom_line(alpha = 0.6) +
  facet_wrap(~Ne, ncol = 1) +
  coord_cartesian(ylim = c(0, 1)) +
  labs(
    title = "Genetic drift (Wright–Fisher) in replicate populations",
    subtitle = "Same starting allele frequency, different effective population sizes",
    x = "Generation",
    y = "Allele frequency (p)"
  ) +
  theme_classic()

Bonus: vibe-coded interactive line graph

Your instructor thought his electricity bill looked high, so he downloaded a year’s data from the utility company.

This code produces an interactive bar graph of electricity usage every hour, colored by hourly electricity cost.

No need to change anything here - just play with the interactive plot for a few moments and check out how surprisingly simple the plotting code is.

d2 <- util %>%
  mutate(
    datetime = as.POSIXct(as.Date(DATE, format = "%m/%d/%y"), tz = "America/New_York") +
      as.numeric(`START TIME`),
    cost = readr::parse_number(COST)
  ) %>%
  arrange(datetime)

p <- plot_ly(
  data = d2,
  x = ~datetime,
  y = ~`USAGE (kWh)`,
  type = "bar",
  customdata = ~cost,
  hovertemplate = paste(
    "%{x}<br>",
    "Usage: %{y:.2f} kWh<br>",
    "Cost: $%{customdata:.2f}",
    "<extra></extra>"
  ),
  marker = list(
    color = ~cost,                 # <- continuous color mapping
    showscale = TRUE,              # <- colorbar legend
    colorbar = list(title = "Cost ($)")
  )
) %>%
  layout(
    title = "Hourly Usage (kWh), Colored by Cost ($)",
    xaxis = list(
      title = "Time",
      type = "date",
      rangeslider = list(visible = TRUE)
    ),
    yaxis = list(title = "Usage (kWh)")
  )

p

Smoothing

The dataset lrr (read in above) contains LRR values on chromosome 22 for a single scan.

Make a scatterplot of the data. Then use the function rollmean from the zoo package to calculate moving average, and plot it as a connected line. Include the option fill=NA. Vary the k parameter to see if you can find one that does a good job of pinpointing the deletion. (Change xlim() if that helps.)

lrr %>% 
  mutate(avg=rollmean(lrr, k=5, align="center", fill=NA)) %>% 
  ggplot() +
  geom_point(aes(x=pos/10^6, y=lrr)) +
  geom_line(aes(x=pos/10^6, y=avg)) +
  xlab("Chr 22 (Mb)") +
  xlim(c(15,25))
## Warning: <ggplot> %+% x was deprecated in ggplot2 4.0.0.
## ℹ Please use <ggplot> + x instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 732 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 734 rows containing missing values or values outside the scale range
## (`geom_line()`).

Try the same thing with a LOESS curve (geom_smooth(method="loess")) instead of rollmean. Turn off the error bar, and vary the span parameter.

lrr %>% 
  mutate(avg=rollmean(lrr, k=5, align="center", fill=NA)) %>% 
  ggplot() +
  geom_point(aes(x=pos/10^6, y=lrr)) +
  geom_smooth(aes(x=pos/10^6, y=avg),
              se=FALSE,
              method="loess",
              span=0.02) +
  xlab("Chr 22 (Mb)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).

Overplotting

Now, let’s try making a scatterplot of gene expression in two samples in the sc.csv dataset (read in above). Pick two samples and make a scatterplot of the expression of all the genes. Apply the log2(x+1) transformation first. (Don’t use sample1 and sample2, as they’re so similar that they make for a boring plot.)

How many points are in the plot? How many points do you think you can see in the plot?

In your first plot, try using the alpha parameter to control overplotting.

# Your code here
sc %>%
  ggplot() +
  geom_point(aes(x=log2(sample1+1),y=log2(sample5+1)),
             alpha=0.1)

Now try a few more ways of addressing the overplotting problem.

First, try out geom_bin2d and geom_hex

  • Try filtering out genes that are unexpressed in either sample

  • Also try adjusting bin parameters

sc %>%
  ggplot() +
  geom_bin2d(aes(x=log2(sample1+1),y=log2(sample5+1)))
## `stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.

sc %>%
  ggplot() +
  geom_hex(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=30)

sc %>%
  filter(sample1 > 0 & sample2 > 0) %>% 
  ggplot() +
  geom_bin2d(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=50)

sc %>%
  filter(sample1 > 0 & sample2 > 0) %>% 
  ggplot() +
  geom_hex(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=50)

Now try two versions of a 2D density plot: geom_density_2d and geom_density_2d_filled. Notice the default plot is not helpful without filtering lowly-expressed genes. Try putting the two geoms in a single plot and overlaying a scatterplot.

Note there are some more variations here.

sc %>%
  filter(sample1 > 0 & sample2 > 0) %>% 
  ggplot() +
  geom_density_2d_filled(aes(x=log2(sample1+1),y=log2(sample5+1))) +
  geom_density_2d(aes(x=log2(sample1+1),y=log2(sample5+1))) +
  geom_point(aes(x=log2(sample1+1),y=log2(sample5+1)), alpha=0.1)

Connected scatterplots

If we have two variables that change with time, we could plot a line graph of each. Here, gene-expression measurements from synchronized cells are summarized by two principal components.

data(yeast)
# genes x timepoints
dim(yeast$y)
## [1] 542  18
# transpose so rows = timepoints
pca <- prcomp(t(yeast$y), scale. = TRUE)
pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  idx = seq_len(nrow(pca$x))  # implicit time index
)
pca_df %>% 
  ggplot() +
  geom_point(aes(x=idx, y=PC1)) + geom_line(aes(x=idx, y=PC1), color="red") +
  geom_point(aes(x=idx, y=PC2)) + geom_line(aes(x=idx, y=PC2), color="blue") +
  ylab("PC1 (red) and PC2 (blue)")+
  ggtitle("Two PCs over time")

Instead, we could look at how PC1 and PC2 relate to each other directly. Make a connected scatterplot to show this.

  • Try geom_line - why doesn’t it work?

  • Use geom_path instead.

  • Try including arrow=arrow().

# Your code here
ggplot(pca_df, aes(PC1, PC2)) +
  geom_path(aes(color=idx), arrow=arrow()) +
  geom_point()

Many variables

We can use the ggpairs function from GGally to make pairwise plots of many variables.

Try using ggpairs to explore expression in 5 samples in the sc dataset. log2(x_1)-transform the variables and filter out non-expressed genes first.

samples <- c("sample1", "sample2", "sample3", "sample4", "sample5")
sc %>% 
  select(all_of(samples)) %>% 
  filter(rowSums(sc[, samples] <= 0) == 0) %>%
  mutate(across(everything(), function(x) log2(x + 1))) %>%
  ggpairs()

Correlations and heatmaps

Now try using the corrplot function from the corrplot package.

  • Try using the same code you used in the chunk above to filter and process the data

  • This time, use all the samples

  • Pipe it into the cor function to calculate correlations

  • Pipe the correlation matrix into corrplot

  • Experiment with different values of the method argument (or use the corrplot.mixed function and separately specify lower and upper; note the default plotting parameters are not wonderful)

sc %>% 
  filter(rowSums(sc[, samples] <= 0) == 0) %>%
  mutate(across(everything(), function(x) log2(x + 1))) %>%
  cor() %>% 
  corrplot.mixed(corr=., upper="ellipse", lower="number")

3D scatterplot

There are several ways to make 3D scatterplots in R. These include

  • plotly::plot_ly(type="scatter3d", mode="markers")

  • rgl::plot3d

  • scatterplot3d:scatterplot3d (only a 2D projection)

Try using at least one of the above to make a scatterplot of the first 3 PCs of ancestry in the dataset pcs. Color the points by Ancestry. You will probably need to check the documentation of the plotting function you use (hint: look for examples at the bottom of the help page).

# Your code here
plot_ly(pcs, x=~PC1, y=~PC2, z=~PC3, type="scatter3d", mode="markers", color=pcs$Ancestry)
plot3d(pcs$PC1, pcs$PC2, pcs$PC3, col=as.numeric(factor(pcs$Ancestry)))
scatterplot3d(pcs$PC1, pcs$PC2, pcs$PC3, color=as.numeric(factor(pcs$Ancestry)))

Parallel set/alluvial plot

Let’s make a parallel set-style plot to show how the categorical variables Type and clinsig are related in the cv dataset.

There are several packages for these kinds of plots, and they all seem to be a little quirky. We will try geom_parallel_sets from the ggforce package.

Formatting

First we need to use some helper functions to get the data in shape for geom_parallel_set.

Currently, the data look like this:

# Apply head(cv) here

We need to aggregate the cross-category counts and convert that into parallel sets format.

Copy this code and run it in the chunk below:

# Aggregate counts
d_counts <- cv %>%
  count(Type, clinsig, name = "n")

# Convert to parallel-sets format
d_ps <- gather_set_data(d_counts, c("Type", "clinsig"))

head(d_counts)
head(d_ps)
# Aggregate counts
d_counts <- cv %>%
  count(Type, clinsig, name = "n")
head(d_counts)
##          Type                clinsig    n
## 1    Deletion                 Benign  181
## 2    Deletion          Likely benign  290
## 3    Deletion      Likely pathogenic  470
## 4    Deletion             Pathogenic 8610
## 5    Deletion Uncertain significance  674
## 6 Duplication                 Benign  126
# Convert to parallel-sets format
d_ps <- gather_set_data(d_counts, c("Type", "clinsig"))
head(d_ps)
##          Type                clinsig    n id    x           y
## 1    Deletion                 Benign  181  1 Type    Deletion
## 2    Deletion          Likely benign  290  2 Type    Deletion
## 3    Deletion      Likely pathogenic  470  3 Type    Deletion
## 4    Deletion             Pathogenic 8610  4 Type    Deletion
## 5    Deletion Uncertain significance  674  5 Type    Deletion
## 6 Duplication                 Benign  126  6 Type Duplication

Plotting

Now we’re ready to plot

  • Use ggplot

    • with d_ps as the input data frame

    • and aes(x, id = id, split = y, value = n)

  • Also include these three geoms

    • geom_parallel_sets

    • geom_parallel_sets_axes

    • geom_parallel_sets_labels

  • To improve the plot:

    • adjust alpha in one of the layers (which?)

    • adjust size in one of the layers (which?)

    • include axis.width = 0.25, color = "grey30", fill = "grey95" in one of the layers (which?)

    • try coloring the ribbons by either Type or clinsig (which layer?)

# Plot: straight-sided parallel sets
ggplot(d_ps, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill=Type), alpha=0.7) +
  geom_parallel_sets_axes(axis.width = 0.25,
                          color = "grey30",
                          fill = "grey95") +
  geom_parallel_sets_labels(size = 3)